# Date range used by the difftime() benchmarks below.
start_date <- "2017-01-01"
end_date <- "2019-12-31"

# Whole weeks between two dates, letting difftime() coerce character input
# (it goes through as.POSIXct internally).
# Returns the floor of the week count. Fix: return the value directly instead
# of stashing it in a local and returning it invisibly.
f1 <- function(d2, d1) {
  floor(as.numeric(difftime(d2, d1, units = "weeks")))
}
# Same computation as f1(), but with an explicit as.Date() cast (cheaper than
# the as.POSIXct coercion difftime() applies to character input).
# Fix: return the value directly; the original assigned to an unused local
# and returned it invisibly, with an awkward mid-call line break.
f2 <- function(d2, d1) {
  floor(as.numeric(difftime(as.Date(d2), as.Date(d1), units = "weeks")))
}
# Time 1000 runs of each difftime variant (character input vs as.Date cast).
m1 <- microbenchmark(
  Nocast = f1(end_date, start_date),
  Cast = f2(end_date, start_date),
  times = 1000
)
print(m1)
## Unit: microseconds
## expr min lq mean median uq max neval
## Nocast 383.726 419.1025 439.1127 421.8325 431.842 4003.758 1000
## Cast 126.918 138.2880 148.6586 139.5460 141.855 3783.847 1000
fbox_plot(m1, "microseconds")
# Intentionally slow baseline: grows the vector with c() on every iteration,
# copying it each time (O(n^2) total work). The growth pattern is the point
# of the benchmark, so it is kept as-is. Returns NULL invisibly (the for
# loop is the last expression); only the allocation cost is measured.
no_size <- function(n) {
  acc <- c()
  for (k in seq(n)) {
    acc <- c(acc, k)
  }
}
# Fast counterpart to no_size(): preallocates the full integer vector once,
# then fills it in place — no copies. Returns NULL invisibly (the for loop
# is the last expression); only the fill cost is measured.
explicit_size <- function(n) {
  out <- vector("integer", n)
  for (k in seq(n)) {
    out[k] <- k
  }
}
# Growing vs preallocated vector, 10 runs at n = 1e4.
m3 <- microbenchmark(
  no_size = no_size(1e4),
  explicit_size = explicit_size(1e4),
  times = 10
)
print(m3)
## Unit: microseconds
## expr min lq mean median uq max
## no_size 71674.568 72094.832 78374.9607 72870.72 78238.637 105658.459
## explicit_size 365.022 367.016 706.5107 382.76 409.285 3584.815
## neval
## 10
## 10
fbox_plot(m3, "microseconds")
# 1e8 random doubles (~800 MB) for the which() vs sum() comparison.
# NOTE(review): the name `vector` shadows base::vector() — consider renaming.
vector <- runif(1e8)
# Count elements above 0.5 via which() + length().
# Fix: return the count directly instead of assigning it to an unused local
# (which returned it invisibly).
w1 <- function(x) {
  length(which(x > .5))
}
# Count elements above 0.5 by summing the logical mask directly — avoids the
# intermediate index vector that which() builds.
# Fix: return the count directly instead of an unused local assignment.
w2 <- function(x) {
  sum(x > .5)
}
# which() vs logical-sum counting on the 1e8-element vector.
m4 <- microbenchmark(
  which = w1(vector),
  nowhich = w2(vector),
  times = 10
)
print(m4)
## Unit: milliseconds
## expr min lq mean median uq max neval
## which 611.8463 614.1064 638.7722 615.8948 620.5165 735.8710 10
## nowhich 216.6689 217.8526 233.3565 222.2152 222.9557 332.0488 10
# Fix: label typo "miliseconds" -> "milliseconds" (matches the Unit above).
fbox_plot(m4, "milliseconds")
# 10k-row table for the row- vs column-subsetting comparison (v1/v2 below).
n <- 1e4
dt <- data.table(
a = seq(n), b = runif(n)
)
# Row-first subsetting: build the filtered table, then pull column a.
# Fix: return the mean directly instead of an unused local assignment.
v1 <- function(dt) {
  mean(dt[dt$b > .5, ]$a)
}
# Column-first subsetting: index the single column with the logical mask —
# avoids materializing the filtered two-column table that v1() builds.
# Fix: return the mean directly instead of an unused local assignment.
v2 <- function(dt) {
  mean(dt$a[dt$b > .5])
}
# Row-wise vs column-wise subsetting, 10 runs each.
m5 <- microbenchmark(
  row_operation = v1(dt),
  column_operation = v2(dt),
  times = 10
)
print(m5)
## Unit: microseconds
## expr min lq mean median uq max neval
## row_operation 214.801 221.204 1002.6255 233.5710 245.498 5472.771 10
## column_operation 79.830 91.671 310.4688 99.7055 131.165 2143.754 10
fbox_plot(m5, "microseconds")
# The seq() function avoids the 1:x pitfall: when x is 0, 1:x yields c(1, 0),
# whereas seq(0) returns an empty sequence.
# Sequence length for the 1:num vs seq(num) comparison.
num <- 1e7

# The "noseq" arm: deliberately uses the 1:num form being compared against
# seq(). Fix: return the mean directly instead of an unused local assignment.
s1 <- function(num) {
  mean(1:num)
}
# The "seq" arm: same sequence built with seq(), which is safe when num is 0.
# Fix: return the mean directly instead of an unused local assignment.
s2 <- function(num) {
  mean(seq(num))
}
# 1:num vs seq(num), 30 runs each — essentially identical timings.
m6 <- microbenchmark(
  noseq = s1(num),
  seq = s2(num),
  times = 30
)
print(m6)
## Unit: milliseconds
## expr min lq mean median uq max neval
## noseq 69.15290 69.28184 69.43360 69.3752 69.43299 71.15839 30
## seq 69.19934 69.34238 69.55579 69.3735 69.51002 71.40295 30
# Fix: label typo "miliseconds" -> "milliseconds".
fbox_plot(m6, "milliseconds")
# One-million-row table for the paste0() vs glue() string-building benchmark.
large_dataset <- data.table(
id = 1:1000000,
value = sample(letters, 1000000, replace = TRUE)
)
# Build the "id_value" code column with base paste0().
# Fix: return the mutated table directly instead of an unused local assignment.
a1 <- function(x) {
  x |> mutate(code = paste0(id, "_", value))
}
# Build the same code column with glue() string interpolation.
# Fix: return the mutated table directly instead of an unused local assignment.
a2 <- function(x) {
  x |> mutate(code = glue("{id}_{value}"))
}
# paste0() vs glue() on one million rows, 20 runs each.
m7 <- microbenchmark(
  with_paste = a1(large_dataset),
  with_glue = a2(large_dataset),
  times = 20
)
print(m7)
## Unit: milliseconds
## expr min lq mean median uq max neval
## with_paste 558.4607 562.6357 601.5903 566.0397 569.4206 1274.7545 20
## with_glue 587.0184 588.4760 593.4911 592.0364 595.1723 622.0061 20
# Fix: label typo "miliseconds" -> "milliseconds".
fbox_plot(m7, "milliseconds")
# Create a large list: 100k numeric vectors of length 10 for the
# lapply vs for-loop comparison.
big_list <- replicate(1e5, rnorm(10), simplify = FALSE)
# lapply() vs an index for-loop over the list. The loop arm deliberately
# grows `result` element by element — that growth is part of what is timed.
m8 <- microbenchmark(
  lapply = lapply(big_list, mean),
  for_loop = {
    result <- list()
    for (i in seq_along(big_list)) {
      result[[i]] <- mean(big_list[[i]])
    }
  },
  times = 10
)
print(m8)
## Unit: milliseconds
## expr min lq mean median uq max neval
## lapply 321.5016 336.1584 349.1802 348.8675 354.9164 380.3998 10
## for_loop 350.8892 366.8028 403.4082 368.8694 389.5892 648.0231 10
# Fix: label typo "miliseconds" -> "milliseconds".
fbox_plot(m8, "milliseconds")
# 100k-row table with the same dates stored both as base Date and as
# data.table IDate (integer-backed), plus the scalar bounds used by the two
# between-filter functions below.
dt <- data.table(
Date = as.Date('2023-01-01') + 0:99999,
iDate = as.IDate('2023-01-01') + 0:99999,
Value = rnorm(100000)
)
nd1 <- as.Date('2023-01-01')
nd2 <- as.Date('2023-01-10')
id1 <- as.IDate('2023-01-01')
id2 <- as.IDate('2023-01-10')
# Filter rows whose base-Date column falls in [nd1, nd2] with dplyr::filter.
# Generalized: `data` is now a parameter defaulting to the global dt, so
# existing two-argument calls keep working while the global dependence is
# made explicit.
date_between_operation <- function(nd1, nd2, data = dt) {
  data |> filter(Date >= nd1 & Date <= nd2)
}
# Same range filter on the IDate column using data.table::between() via the
# native-pipe `_` placeholder (requires R >= 4.3).
# Generalized: `data` parameter defaults to the global dt (backward
# compatible), making the global dependence explicit.
idate_between_operation <- function(id1, id2, data = dt) {
  data |> _[data.table::between(iDate, id1, id2)]
}
# Date vs IDate range filtering, 200 runs each.
m9 <- microbenchmark(
  Date = date_between_operation(nd1, nd2),
  iDate = idate_between_operation(id1, id2),
  times = 200L
)
print(m9)
## Unit: microseconds
## expr min lq mean median uq max neval
## Date 1367.675 1608.0435 1966.2702 1982.592 2206.587 4162.645 200
## iDate 513.229 586.5605 727.7369 652.975 850.413 2395.735 200
# Fix: the label said "miliseconds" but the results are in microseconds.
fbox_plot(m9, "microseconds")
# Map a single-letter code to a fruit name via switch(); any unmatched code
# falls through to the trailing "default" value.
switch_function <- function(x) {
  switch(
    x,
    a = "apple",
    b = "banana",
    c = "cherry",
    "default"
  )
}
# dplyr::case_when() equivalent of switch_function(). case_when evaluates
# every right-hand side for the whole input, which is why calling it per
# scalar (via sapply below) is so much slower than switch().
case_when_function <- function(x) {
  case_when(
    x == "a" ~ "apple",
    x == "b" ~ "banana",
    x == "c" ~ "cherry",
    TRUE ~ "default"
  )
}
# Create a vector of test values (includes "d", which hits the default branch).
test_values <- sample(c("a", "b", "c", "d"), 1000, replace = TRUE)
# Per-scalar dispatch cost: switch() vs case_when(), 200 runs each.
m10 <- microbenchmark(
  switch = sapply(test_values, switch_function),
  case_when = sapply(test_values, case_when_function),
  times = 200L
)
print(m10)
## Unit: microseconds
## expr min lq mean median uq max
## switch 602.806 659.527 685.6361 668.0125 685.9265 2206.742
## case_when 218507.740 232101.599 238316.3616 238497.3895 241205.5430 417453.563
## neval
## 200
## 200
fbox_plot(m10, "microseconds")
# One-million-row table of values in 1..100 for the case_when vs fcase
# categorization benchmark.
set.seed(123)
n <- 1e6
data <- data.table(
id = seq(n),
value = sample(seq(100), n, replace = TRUE)
)
# Categorize value into Low/Medium/High with dplyr::case_when (conditions
# are checked in order, so the <= 70 branch only sees values above 20).
# Fix: return the result directly instead of an unused local assignment.
casewhenf <- function(data) {
  data |>
    mutate(category = case_when(
      value <= 20 ~ "Low",
      value <= 70 ~ "Medium",
      value > 70 ~ "High"))
}
# Same categorization using data.table::fcase (vectorized, evaluated lazily).
# Fix: return the result directly instead of an unused local assignment.
fcasef <- function(data) {
  data |>
    mutate(category = fcase(
      value <= 20, "Low",
      value <= 70, "Medium",
      value > 70, "High"))
}
# case_when vs fcase on one million rows, 20 runs each.
m11 <- microbenchmark(
  case_when = casewhenf(data),
  fcase = fcasef(data),
  times = 20
)
print(m11)
## Unit: milliseconds
## expr min lq mean median uq max neval
## case_when 54.28990 59.28602 63.74796 63.49637 67.93778 72.99883 20
## fcase 19.82819 20.79428 22.17237 21.68749 22.92994 27.45743 20
# Fix: label typo "miliseconds" -> "milliseconds".
fbox_plot(m11, "milliseconds")
# One-million-row table with NAs sprinkled into both value columns for the
# replace_na vs fcoalesce benchmark.
set.seed(123)
DT <- data.table(
ID = 1:1e6,
Value1 = sample(c(NA, 1:100), 1e6, replace = TRUE),
Value2 = sample(c(NA, 101:200), 1e6, replace = TRUE)
)
# Define the functions
# Fill NAs in both columns with tidyr::replace_na, then coerce back to a
# data.table so both benchmark arms return the same class.
# Fix: return the result directly instead of an unused local assignment.
replace_na_f <- function(data) {
  data |>
    mutate(Value1 = replace_na(Value1, 0),
           Value2 = replace_na(Value2, 0)) |>
    as.data.table()
}
# Fill NAs with data.table::fcoalesce; 0L keeps the integer column type.
# Fix: return the result directly instead of an unused local assignment.
fcoalesce_f <- function(data) {
  data |>
    mutate(Value1 = fcoalesce(Value1, 0L),
           Value2 = fcoalesce(Value2, 0L))
}
# replace_na vs fcoalesce NA-filling, 20 runs each.
m12 <- microbenchmark(
  treplace_na = replace_na_f(DT),
  tfcoalesce = fcoalesce_f(DT),
  times = 20
)
print(m12)
## Unit: milliseconds
## expr min lq mean median uq max neval
## treplace_na 7.169631 7.391961 8.672590 7.811705 10.205576 12.617687 20
## tfcoalesce 1.528014 1.602102 2.165468 2.045485 2.444476 4.726348 20
# Fix: label typo "miliseconds" -> "milliseconds".
fbox_plot(m12, "milliseconds")
# Tiny table of dotted strings for the negated-grepl filtering comparison.
dt <- data.table(field_name = c("argentina.blue.man.watch",
"brazil.red.woman.shoes",
"canada.green.kid.hat",
"denmark.red.man.shirt"))
# Filter rows where 'field_name' does not contain 'red'
# data.table filter via the native-pipe `_` placeholder (R >= 4.3); keeps
# rows whose field_name does NOT match "red".
# Fix: return the result directly instead of an unused local assignment.
dtnot <- function(data) {
  data |> _[!grepl("red", field_name)]
}
# Same filter via an anonymous-function wrapper around base subsetting.
# Fix: return the result directly instead of an unused local assignment.
anonymousnot <- function(data) {
  data |> (\(dt) dt[!grepl("red", dt$field_name), ])()
}
# Same filter with dplyr::filter for comparison.
# Fix: return the result directly instead of an unused local assignment.
dplyrnot <- function(data) {
  data |> filter(!grepl("red", field_name))
}
# Three ways to drop the "red" rows, 100 runs each.
m13 <- microbenchmark(
  anonymous_not = anonymousnot(dt),
  data_table_not = dtnot(dt),
  dplyr_not = dplyrnot(dt),
  times = 100
)
print(m13)
## Unit: microseconds
## expr min lq mean median uq max neval
## anonymous_not 105.958 111.8695 157.1831 122.0885 144.7060 3107.734 100
## data_table_not 103.183 108.4070 142.3854 118.9920 139.6160 1898.757 100
## dplyr_not 684.649 713.8085 759.9004 726.6820 744.4805 2975.568 100
fbox_plot(m13, "microseconds")
# 100k-row wide table (four var columns) for the pivot_longer vs melt
# reshaping benchmark.
large_data <- data.table(
id = 1:100000,
var1 = rnorm(100000),
var2 = rnorm(100000),
var3 = rnorm(100000),
var4 = rnorm(100000)
)
# Benchmarking: tidyr::pivot_longer vs data.table::melt, 10 runs each.
m14 <- microbenchmark(
  tidyr_pivot_longer = {
    long_data_tidyr <- pivot_longer(
      large_data,
      cols = starts_with("var"),
      names_to = "variable",
      values_to = "value"
    )
  },
  data_table_melt = {
    long_data_dt <- melt(
      large_data,
      id.vars = "id",
      variable.name = "variable",
      value.name = "value"
    )
  },
  times = 10
)
print(m14)
## Unit: microseconds
## expr min lq mean median uq max
## tidyr_pivot_longer 6448.133 6574.229 8775.3237 6812.5635 6962.845 26783.203
## data_table_melt 444.881 474.647 555.0589 529.7945 635.467 727.048
## neval
## 10
## 10
fbox_plot(m14, "microseconds")
# Two 1000-element vectors; their cross join below has 1e6 rows.
vec1 <- seq(1000)
vec2 <- seq(1000)
# Define functions to be benchmarked
# Cross join of the two global vectors with tidyr::expand_grid.
# Fix: dropped the explicit return() — in R, explicit return is reserved for
# early exits; the last expression is returned.
expand_grid_func <- function() {
  expand_grid(vec1, vec2)
}
# Cross join with data.table::CJ (also sorts and keys the result).
# Fix: dropped the explicit return() — the last expression is returned.
CJ_func <- function() {
  CJ(vec1, vec2)
}
# Perform benchmarking: expand_grid vs CJ cross joins, 10 runs each.
m15 <- microbenchmark(
  expand_grid = expand_grid_func(),
  CJ = CJ_func(),
  times = 10
)
print(m15)
## Unit: microseconds
## expr min lq mean median uq max neval
## expand_grid 2197.134 2213.805 2539.9229 2289.1205 2328.710 3829.502 10
## CJ 475.828 486.258 649.3227 489.0785 509.942 1776.739 10
fbox_plot(m15, "microseconds")
# Sample data: fifty 10k-row tables with duplicated ids for the row-binding
# benchmark. Fixes: `<-` instead of `=` for assignment; TRUE/FALSE instead of
# the reassignable T/F shorthands.
size <- 1e4
set.seed(44)
df_list <- replicate(50, data.table(id = sample(seq(size), size, replace = TRUE),
                                    value = rnorm(size)), simplify = FALSE)
# Row-bind a list of data frames with base rbind dispatched through do.call().
simple_bind <- function(list_of_dfs) {
  do.call(what = rbind, args = list_of_dfs)
}
# Row-bind the list with dplyr::bind_rows for comparison.
dplyr_bind <- function(list_of_dfs){
bind_rows(list_of_dfs)
}
# Row-bind the list with data.table::rbindlist.
# Fix: FALSE instead of the reassignable F shorthand.
dt_bind <- function(list_of_dfs) {
  rbindlist(list_of_dfs, fill = FALSE)
}
# Benchmark both methods (plus base rbind) for binding 50 tables, 30 runs.
m16 <- microbenchmark(
  dt_ver = dt_bind(df_list),
  simple = simple_bind(df_list),
  dplyr_ver = dplyr_bind(df_list),
  times = 30
)
print(m16)
## Unit: microseconds
## expr min lq mean median uq max neval
## dt_ver 468.646 490.977 597.0125 546.6915 569.704 2103.399 30
## simple 471.561 510.464 608.2575 547.9030 617.403 2015.835 30
## dplyr_ver 10344.651 10570.302 10759.8372 10678.8295 10853.681 11991.677 30
fbox_plot(m16, "microseconds")
# One row of five dot-separated words, replicated n times, for the
# string-splitting benchmark. Fix: FALSE instead of the reassignable F.
set.seed(123)
n <- 1e4
df <- data.table(text = paste("word1", "word2", "word3", "word4", "word5", sep = "."),
                 stringsAsFactors = FALSE)
df <- df[rep(1, n), , drop = FALSE]
# Using tidyr::separate
# Split every dotted field into five columns, keeping only w3/w5.
# Generalized: `data` parameter defaults to the global df (backward
# compatible), making the global dependence explicit. Fix: remove = FALSE
# instead of the reassignable F shorthand.
separate_words <- function(data = df) {
  data |>
    separate(text, into = c("w1", "w2", "w3", "w4", "w5"), sep = "\\.", remove = FALSE) |>
    select(-c(w1, w2, w4))
}
# Using stringr::word
# Extract only the 3rd and 5th dotted words directly.
# Generalized: `data` parameter defaults to the global df (backward
# compatible), making the global dependence explicit.
stringr_words <- function(data = df) {
  data |>
    mutate(
      w3 = word(text, 3, sep = fixed(".")),
      w5 = word(text, 5, sep = fixed("."))
    )
}
datatable_words <- function() {
df |> _[, c("w3", "w5") := tstrsplit(text, "\\.")[c(3, 5)]]
}
# Three string-splitting approaches, 10 runs each.
m17 <- microbenchmark(
  separate = separate_words(),
  stringr = stringr_words(),
  dt = datatable_words(),
  times = 10
)
print(m17)
## Unit: milliseconds
## expr min lq mean median uq max neval
## separate 77.55017 78.71027 83.93366 80.09989 91.54156 95.72738 10
## stringr 176.15070 178.83882 182.15315 181.00322 184.41627 194.52496 10
## dt 12.80430 12.97534 13.40331 13.02553 13.65529 15.74265 10
# Fix: label typo "miliseconds" -> "milliseconds".
fbox_plot(m17, "milliseconds")
# Sample data: one million rows with NAs in y and z for the NA-dropping
# benchmark. Fix: FALSE instead of the reassignable F shorthand.
set.seed(123)
n <- 1e6
df <- data.table(
  x = rnorm(n),
  y = sample(c(NA, 1:100), n, replace = TRUE),
  z = sample(c(NA, letters), n, replace = TRUE),
  stringsAsFactors = FALSE
)
# Benchmark both methods of dropping rows with any NA.
m18 <- microbenchmark(
  dplyr_drop_na = {
    df |> drop_na()
  },
  data_table_na_omit = {
    # BUG FIX: the original ran na.omit() on `dt` — the tiny 4-row table from
    # an earlier section — not on `df`, so the two arms operated on different
    # data and the recorded na.omit timings below are meaningless. The output
    # comments are kept as a record of that buggy run.
    df |> na.omit()
  },
  times = 10
)
print(m18)
## Unit: microseconds
## expr min lq mean median uq max
## dplyr_drop_na 9678.687 9713.142 9817.8071 9726.5720 9750.902 10600.92
## data_table_na_omit 9.097 10.149 52.5112 61.7805 65.682 165.60
## neval
## 10
## 10
fbox_plot(m18, "microseconds")
# Sample data: 100 tables to join against extra_df in the sequential vs
# parallel merge benchmark. Fixes: `<-` instead of `=` for assignment;
# TRUE/FALSE instead of the reassignable T/F shorthands.
set.seed(123)
size <- 1e4
n_cores <- parallelly::availableCores()
df_list <- replicate(100, data.table(id = sample(seq(size), size, replace = TRUE),
                                     value = rnorm(size)), simplify = FALSE)
extra_df <- data.table(id = sample(seq(size), size, replace = TRUE),
                       extra_value = runif(size))
# Sequential join: merge each table in df_list with extra_df on id, one
# after another. allow.cartesian permits the many-to-many id matches.
# Fix: TRUE instead of the reassignable T shorthand.
sequential_join <- function() {
  lapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = TRUE)
  })
}
# Parallel join using mclapply across all available cores (fork-based, so
# not available on Windows). Fix: TRUE instead of the reassignable T.
parallel_join <- function() {
  mclapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = TRUE)
  }, mc.cores = n_cores, mc.silent = TRUE, mc.cleanup = TRUE)
}
# Benchmark both methods: sequential vs parallel merging, 10 runs each.
m19 <- microbenchmark(
  sequential = sequential_join(),
  parallel = parallel_join(),
  times = 10
)
print(m19)
## Unit: milliseconds
## expr min lq mean median uq max neval
## sequential 301.5262 320.5787 357.6035 337.8800 387.5695 495.8454 10
## parallel 155.6060 182.4350 190.2782 193.4453 198.0251 206.7770 10
# Fix: label typo "miliseconds" -> "milliseconds".
fbox_plot(m19, "milliseconds")
# dtplyr is another alternative (you need to install the dtplyr package before running this benchmark).
# Ten-million-row grouped table for the dplyr vs dtplyr summarization
# benchmark.
set.seed(123)
n <- 1e7
df <- data.table(
group1 = sample(LETTERS[1:10], n, replace = TRUE),
group2 = sample(letters[1:5], n, replace = TRUE),
value1 = rnorm(n),
value2 = runif(n, 1, 100)
)
# Identical filter/mutate/summarize pipeline run directly through dplyr vs
# through dtplyr::lazy_dt (translated to data.table), 5 runs each.
# Fixes: the result variables were named `dplyr`/`dtplyr`, shadowing the
# package names (renamed res_basic/res_dtplyr); `<-` used consistently
# instead of `=`; label typo "miliseconds" -> "milliseconds".
m21 <- microbenchmark(
  basic_way = {
    res_basic <- df |>
      filter(value1 > 0) |>
      mutate(ratio = value1 / value2) |>
      summarize(
        mean_val1 = mean(value1),
        sd_val1 = sd(value1),
        median_val2 = median(value2),
        max_ratio = max(ratio), .by = c("group1", "group2")) |>
      as.data.table()
  },
  dtplyr_way = {
    res_dtplyr <- df |>
      lazy_dt() |>
      filter(value1 > 0) |>
      mutate(ratio = value1 / value2) |>
      summarize(
        mean_val1 = mean(value1),
        sd_val1 = sd(value1),
        median_val2 = median(value2),
        max_ratio = max(ratio), .by = c("group1", "group2")) |>
      as.data.table()
  },
  times = 5
)
print(m21)
## Unit: milliseconds
## expr min lq mean median uq max neval
## basic_way 581.8151 594.4181 625.3249 612.0199 616.2274 722.1441 5
## dtplyr_way 374.1814 408.1085 420.1764 415.0208 419.4515 484.1197 5
fbox_plot(m21, "milliseconds")
# Parquet reads may take longer if the data is partitioned by day; consider partitioning by year instead, or try DuckDB. Another advantage of DuckDB is lower memory consumption, since you can wrangle the data with SQL statements.
# Note: DuckDB requires opening a connection; consider the read_only parameter if you only need to read data, and don't forget to close the connection.
# Data locations: year-partitioned parquet dataset and a pre-built DuckDB
# file holding the same data (the day-partitioned path is kept for reference).
# partitioned_by_day <- "/conf/posit_azure_logs/data/merge_uip_data_test"
partitioned_by_year <- "/conf/posit_azure_logs/test_290825/data/merge_uip_data_test"
my_duckdb <- "/conf/posit_azure_logs/test_290825/data/my_db.duckdb"
# Read the partitioned parquet dataset with arrow + dplyr: select the session
# and memory columns, derive the pool/total aggregates, sort, and collect the
# result as a data.table. Column semantics come from the upstream dataset —
# NOTE(review): the *160*1024 / *256*1024 factors presumably convert node
# counts to MB of memory per pool; confirm against the dataset schema.
with_parquet <- function(folder_path){
data_1 <- open_dataset(file.path(folder_path)) |>
select(
ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
ALL_WIP_CP_node_total, ALL_WIP_BP_node_total
) |>
# Derived columns mirror the SQL in with_duckfile() below.
mutate(
computepool_node_mem = ALL_WIP_CP_node_total * (160 * 1024),
bigpool_node_mem = ALL_WIP_BP_node_total * (256 * 1024),
ALL_WIP_day_session = ALL_WIP_CP_day_session + ALL_WIP_BP_day_session,
ALL_WIP_night_session = ALL_WIP_CP_night_session + ALL_WIP_BP_night_session,
ALL_WIP_node_total = ALL_WIP_CP_node_total + ALL_WIP_BP_node_total,
total_mem_limit = ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit,
total_mem_request = ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request,
total_mem_max = ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max,
total_node_mem = computepool_node_mem + bigpool_node_mem,
# Guard against division by zero when no nodes are present.
average_session_per_node = ifelse(ALL_WIP_node_total != 0,
(ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total, 0)
) |>
arrange(ALL_WIP_CP_day_session, ALL_WIP_CP_night_session, ALL_WIP_BP_day_session) |>
collect() |>
as.data.table()
}
# Query the pre-built DuckDB file; SQL mirrors the dplyr pipeline in
# with_parquet(). Fixes: (1) the connection is now closed via on.exit(), so
# it is released even if the query errors (the original leaked it on
# failure); (2) dropped the pointless double assignment
# `data_2 <- res_duckdb_sql <- ...`; (3) the original's last statement was
# dbDisconnect(), so the function returned its value instead of the data —
# the query result (as a data.table) is now the return value.
with_duckfile <- function(my_path) {
  my_connection <- dbConnect(duckdb::duckdb(), dbdir = my_path, read_only = TRUE)
  on.exit(dbDisconnect(my_connection, shutdown = TRUE), add = TRUE)
  dbGetQuery(
    my_connection,
    statement = "select
ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
ALL_WIP_CP_node_total, ALL_WIP_BP_node_total,
ALL_WIP_CP_node_total * 160 * 1024 as computepool_node_mem,
ALL_WIP_BP_node_total * 256 * 1024 as bigpool_node_mem,
ALL_WIP_CP_day_session + ALL_WIP_BP_day_session as ALL_WIP_day_session,
ALL_WIP_CP_night_session + ALL_WIP_BP_night_session as ALL_WIP_night_session,
ALL_WIP_CP_node_total + ALL_WIP_BP_node_total as ALL_WIP_node_total,
ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit as total_mem_limit,
ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request as total_mem_request,
ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max as total_mem_max,
computepool_node_mem + bigpool_node_mem as total_node_mem,
CASE
WHEN ALL_WIP_node_total != 0 THEN (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total
ELSE 0
END AS average_session_per_node
from mytable order by ALL_WIP_CP_day_session, ALL_WIP_CP_night_session, ALL_WIP_BP_day_session",
    immediate = TRUE) |>
    as.data.table()
}
# DuckDB file vs year-partitioned parquet, 2 runs each (heavy I/O).
m22 <- microbenchmark(
  duckdb_file = with_duckfile(my_duckdb),
  parquet_by_year = with_parquet(partitioned_by_year),
  times = 2
)
print(m22)
## Unit: milliseconds
## expr min lq mean median uq max neval
## duckdb_file 496.7932 496.7932 606.4560 606.4560 716.1188 716.1188 2
## parquet_by_year 576.0924 576.0924 703.0102 703.0102 829.9280 829.9280 2
# Fix: label typo "miliseconds" -> "milliseconds".
fbox_plot(m22, "milliseconds")